import numpy as np
from math import *
from env import dynamics

def soft_policy(Q_matrix,V_matrix,num_action):
  """Build the soft policy pi(a|x,y) = exp(Q[x][y][a] - V[x][y]) on the 10x13 grid.

  Args:
    Q_matrix: soft Q-values, indexed as Q_matrix[x][y][a].
    V_matrix: soft state values, indexed as V_matrix[x][y].
    num_action: number of actions available in every cell.

  Returns:
    An object-dtype array of shape (10, 13, num_action) holding the
    probability of each action in each cell.
  """
  # The builtin `object` replaces the `np.object` alias, which NumPy 1.24 removed.
  distribution=np.zeros((10,13,num_action)).astype(object)
  for x in range(10):
    for y in range(13):
      state_value=V_matrix[x][y]
      for a in range(num_action):
        # exp(Q-V) is mathematically exp(Q)/exp(V), but it never evaluates the
        # two huge exponentials separately, so large values do not overflow.
        distribution[x][y][a]=exp(Q_matrix[x][y][a]-state_value)
  return distribution

def soft_Q_matrix_function(gamma,reward_matrix,cost_matrix,V_matrix,num_action):
  """Compute soft Q-values for every cell and action of the 10x13 grid.

  For each (x, y, a):
    Q[x][y][a] = reward[x,y] - cost[x,y] + gamma * V[next_x][next_y]
  where the next cell is whatever `dynamics` returns for state (x, y) and
  action a.

  Args:
    gamma: discount factor applied to the successor value.
    reward_matrix: per-cell reward, indexed as reward_matrix[x,y].
    cost_matrix: per-cell cost, indexed as cost_matrix[x,y].
    V_matrix: current state values, indexed as V_matrix[x][y].
    num_action: number of actions available in every cell.

  Returns:
    An object-dtype array of shape (10, 13, num_action).
  """
  # The builtin `object` replaces the `np.object` alias, which NumPy 1.24 removed.
  Q_matrix=np.zeros((10,13,num_action)).astype(object)
  for x in range(10):
    for y in range(13):
      # The immediate payoff does not depend on the action; compute it once per cell.
      payoff=reward_matrix[x,y]-cost_matrix[x,y]
      for a in range(num_action):
        # np.asmatrix is what the old `np.mat` alias pointed to (alias removed in NumPy 2.0).
        # NOTE(review): assumes `dynamics` returns in-grid integer coordinates - confirm in env.
        next_state=dynamics(np.asmatrix([x,y]).T,np.asmatrix([a]).T)
        value=V_matrix[next_state.item(0)][next_state.item(1)]
        Q_matrix[x][y][a]=payoff+gamma*value
  return Q_matrix

  
def soft_V_matrix_funciton(Q_matrix,num_action):
  """Compute soft state values V[x][y] = log(sum_a exp(Q[x][y][a])) on the 10x13 grid.

  Args:
    Q_matrix: soft Q-values, indexed as Q_matrix[x][y][a].
    num_action: number of actions available in every cell.

  Returns:
    An object-dtype array of shape (10, 13) with the log-sum-exp of the
    Q-values of each cell.
  """
  # The builtin `object` replaces the `np.object` alias, which NumPy 1.24 removed.
  V_matrix=np.zeros((10,13)).astype(object)
  for x in range(10):
    for y in range(13):
      q_values=[Q_matrix[x][y][a] for a in range(num_action)]
      # Subtract the largest Q before exponentiating: the result is identical
      # mathematically, but exp() can no longer overflow for large Q-values.
      q_max=max(q_values)
      V_matrix[x][y]=q_max+log(sum(exp(q-q_max) for q in q_values))
  return V_matrix

def _soft_bellman_backup(gamma,reward_matrix,cost_matrix,V_matrix,num_action):
  """Run one soft backup from V_matrix and return (Q_matrix, new_V_matrix)."""
  Q_matrix=soft_Q_matrix_function(gamma,reward_matrix,cost_matrix,V_matrix,num_action)
  return Q_matrix,soft_V_matrix_funciton(Q_matrix,num_action)


def _max_abs_change(old_V_matrix,new_V_matrix):
  """Return the largest absolute per-cell difference between two 10x13 value grids."""
  largest=0.0
  for x in range(10):
    for y in range(13):
      largest=max(largest,abs(old_V_matrix[x][y]-new_V_matrix[x][y]))
  return largest


def calculate_soft_policy(omega,gamma,num_action):
  """Run soft value iteration for two agents and return their soft policies.

  Agent 1 receives a reward of 50 in cell (9, 12) and agent 2 a reward of 50
  in cell (9, 0); both pay a cost of 100*omega[x,y] per cell. The value grids
  of both agents are updated in lock-step until neither changes by more than
  0.1 in any cell between two consecutive sweeps.

  Args:
    omega: 10x13 array of per-cell cost weights (scaled by 100 here).
    gamma: discount factor.
    num_action: number of actions available in every cell.

  Returns:
    (policy1, policy2): the soft policies of agent 1 and agent 2, each as
    returned by `soft_policy`.
  """
  tolerance=0.1

  # The builtin `object` replaces the `np.object` alias, which NumPy 1.24 removed.
  reward1_matrix=np.zeros((10,13)).astype(object)
  reward1_matrix[9,12]=50

  reward2_matrix=np.zeros((10,13)).astype(object)
  reward2_matrix[9,0]=50

  cost_matrix=100*omega

  soft_V1_matrix=np.zeros((10,13)).astype(object)
  soft_V2_matrix=np.zeros((10,13)).astype(object)

  while True:
    soft_Q1_matrix,new_soft_V1_matrix=_soft_bellman_backup(
      gamma,reward1_matrix,cost_matrix,soft_V1_matrix,num_action)
    soft_Q2_matrix,new_soft_V2_matrix=_soft_bellman_backup(
      gamma,reward2_matrix,cost_matrix,soft_V2_matrix,num_action)

    max_value1=_max_abs_change(soft_V1_matrix,new_soft_V1_matrix)
    max_value2=_max_abs_change(soft_V2_matrix,new_soft_V2_matrix)

    # Both agents keep iterating until BOTH have converged, so the agent that
    # converges first still receives the extra sweeps.
    if not (max_value1>tolerance or max_value2>tolerance):
      break

    # Progress output: residual of agent 2 before the next sweep.
    print(max_value2)
    soft_V1_matrix=new_soft_V1_matrix
    soft_V2_matrix=new_soft_V2_matrix

  policy1=soft_policy(soft_Q1_matrix,new_soft_V1_matrix,num_action)
  policy2=soft_policy(soft_Q2_matrix,new_soft_V2_matrix,num_action)
  return policy1,policy2

def choose_action(policy_distribution):  # distribution is 4x1
  """Sample an action index from a discrete probability distribution.

  Draws one uniform number in [0, 1) and returns the first index whose
  cumulative probability reaches it.

  Args:
    policy_distribution: sequence of action probabilities, indexable by
      action number.

  Returns:
    The sampled action index. If rounding leaves the cumulative sum just
    below the drawn number, the last action with positive probability is
    returned (or the last index when no entry is positive) instead of None.

  Raises:
    ValueError: if policy_distribution is empty.
  """
  # Use the distribution's own length rather than a module-level global.
  num_choices=len(policy_distribution)
  if num_choices==0:
    raise ValueError("policy_distribution must contain at least one action")
  choice=np.random.uniform()
  sum_value=0.0
  fallback=num_choices-1
  for a in range(num_choices):
    probability=policy_distribution[a]
    sum_value=sum_value+probability
    if sum_value>=choice:
      return a
    if probability>0:
      fallback=a
  # Only reached when the probabilities sum to less than the drawn number.
  return fallback

def trial(initial_state,policy1,policy2,num_action,num_steps=35):
  """Roll out one joint trajectory of the two agents under their soft policies.

  Args:
    initial_state: 4x1 column holding (x1, y1, x2, y2), read via `.item(k)`.
    policy1: policy of agent 1, indexed as policy1[x][y] -> action distribution.
    policy2: policy of agent 2, indexed the same way.
    num_action: unused; kept so existing callers keep working.
    num_steps: number of time steps to simulate (defaults to the original 35).

  Returns:
    A list with one entry per step: [x1, y1, x2, y2, action1, action2],
    where the coordinates are the state BEFORE the actions are applied.
  """
  trajectory=[]
  state=initial_state
  for _ in range(num_steps):
    x1,y1,x2,y2=(state.item(k) for k in range(4))

    # Agent 1 samples first, then agent 2, so the random draws keep their order.
    # np.asmatrix is what the old `np.mat` alias pointed to (alias removed in NumPy 2.0).
    action1=choose_action(policy1[x1][y1][:])
    next_state1=dynamics(state[0:2],np.asmatrix([action1]).T)
    action2=choose_action(policy2[x2][y2][:])
    next_state2=dynamics(state[2:4],np.asmatrix([action2]).T)

    trajectory.append([x1,y1,x2,y2,action1,action2])
    # np.copy is kept on purpose: it returns a plain ndarray even when vstack
    # yields an np.matrix, which is the type later steps have always received.
    state=np.copy(np.vstack((next_state1,next_state2)))
  return trajectory

# Script section: solve for both agents' soft policies, then sample expert
# trajectories and write them to a text file.
num_action=4
gamma=0.9

# omega marks the cells that incur a cost (scaled by 100 inside
# calculate_soft_policy). The builtin `object` replaces the `np.object` alias,
# which NumPy 1.24 removed.
omega=np.zeros((10,13)).astype(object)
omega[0,:]=1.0
omega[2,1]=1.0
#omega[1,3]=1.0
omega[3,11]=1.0
omega[8,0]=1.0
omega[7,12]=1.0
omega[3:10,2:11]=1.0

# Joint start state (x1, y1, x2, y2) as a 4x1 column. np.asmatrix is what the
# old `np.mat` alias pointed to (alias removed in NumPy 2.0).
initial_state=np.asmatrix([9,0,9,12]).T
policy1,policy2=calculate_soft_policy(omega,gamma,num_action)

num_trials=50
# The context manager closes the file even if a rollout or a write fails.
with open("expert_trajectory_file.txt","w") as trajectory_file:
  for i in range(num_trials):
    trajectory=np.copy(trial(initial_state,policy1,policy2,num_action))
    for entry in trajectory:
      # Each step (x1, y1, x2, y2, action1, action2) is written one value per line.
      np.savetxt(trajectory_file,entry)
# Show the last sampled trajectory.
print(trajectory)







